This markdown uses the spsurvey package developed by Michael Dumelle, Tom Kincaid, Tony Olsen, and Marc Weber at EPA to analyze the results of New York’s spatially-balanced probability-based survey in 2020-2021. The results include categorical condition extent estimates for statewide conditions, cumulative distribution functions for water quality parameters, and comparisons to the National Lakes Assessment (nationally and regionally), New Hampshire and to New York’s historical data (LMAS samples since 2010).
start.time <- Sys.time()
library(tidyverse)
library(huxtable)
library(ggplot2)
library(lubridate)
library(egg)
library(gridExtra)
library(ggpattern)
library(spsurvey) #This code was written for spsurvey v5.0.0
`%!in%` <- Negate(`%in%`)
To run cat_analysis(), the dataframe will need to include columns for the categorical variable you are interested in, weight for each site, xcoordinate and ycoordinate. Optionally, you can also include a subpopulation or stratum column to separate results and site IDs for two-stage analysis (for example, if there are repeated sites).
In this script, weight is adjusted because of the proportional design. In the future, this can be excluded as the design should include weights already.
I am designating relevant variables from the dataset as well as creating new variables: mean dissolved oxygen from 0-2m, epilimentic means for ph and spconductance, total nitrogen and DIN:TP ratio.
# retrieve raw data from database
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Current")
source("new_database/Reading.LMAS.Data.R")
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")
rm(list=setdiff(ls(), c('newdata',"start.time")))
#list relevant variables
lab<-newdata %>%
filter(SAMPLE_DATE>'2020-01-01') %>%
mutate(combined=paste(CHARACTERISTIC_NAME,
INFORMATION_TYPE,
RSLT_RESULT_SAMPLE_FRACTION,
sep = "_")) %>%
select(LAKE_HISTORY_ID,
SAMPLE_DATE,
combined,
RSLT_RESULT_VALUE,
RSLT_LABORATORY_QUALIFIER,
RSLT_VALIDATOR_QUALIFIER,
RSLT_PROFILE_DEPTH) %>%
mutate(RSLT_RESULT_VALUE=ifelse(!is.na(RSLT_LABORATORY_QUALIFIER)&(RSLT_LABORATORY_QUALIFIER=="U"|RSLT_LABORATORY_QUALIFIER=="UE"),"0",RSLT_RESULT_VALUE),
RSLT_RESULT_VALUE=as.numeric(RSLT_RESULT_VALUE)) %>%
filter(!is.na(RSLT_RESULT_VALUE),
is.na(RSLT_VALIDATOR_QUALIFIER)|(RSLT_VALIDATOR_QUALIFIER!="R"),
combined %in% c('CHLOROPHYLL A_OW_TOTAL',
'PHOSPHORUS, TOTAL_OW_TOTAL',
"MICROCYSTIN_OW_NA",
"NITROGEN, NITRATE (AS N)_OW_TOTAL",
"NITROGEN, NITRATE-NITRITE_OW_TOTAL",
"NITROGEN, KJELDAHL, TOTAL_OW_TOTAL",
"NITROGEN, TOTAL_OW_TOTAL",
"DISSOLVED OXYGEN_DP_NA",
"TRUE COLOR_OW_TOTAL",
"PH_DP_NA",
"SPECIFIC CONDUCTANCE_DP_NA",
"TEMPERATURE_DP_NA",
"DEPTH, SECCHI DISK DEPTH_SD_NA",
"CALCIUM_OW_TOTAL",
"CHLORIDE_OW_TOTAL",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA",
"CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA",
"ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL",
"ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL",
"CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED",
"NITROGEN, AMMONIA (AS N)_OW_TOTAL")) %>%
select(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_RESULT_VALUE,RSLT_PROFILE_DEPTH) %>%
distinct(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_PROFILE_DEPTH,.keep_all = TRUE) %>%
rename(LAKE_ID=LAKE_HISTORY_ID,
chemical_name=combined,
result_value=RSLT_RESULT_VALUE,
profile_depth=RSLT_PROFILE_DEPTH)
# dissolved oxygen:
# keep DO values between 0 and 2m depth
lab2 <- lab %>%
mutate(keep=case_when(
!(chemical_name=="DISSOLVED OXYGEN_DP_NA") ~ "no",
chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth <= 2 ~ "yes",
chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth > 2 ~ "no")) %>%
filter(keep!="no") %>%
select(-keep)
# mean DO for each lake each date
DO <- aggregate(lab2$result_value,list(lab2$LAKE_ID,lab2$SAMPLE_DATE),FUN=mean) %>%
mutate(chemical_name="DISSOLVED OXYGEN_epi") %>%
rename(LAKE_ID=Group.1,
SAMPLE_DATE=Group.2,
result_value=x)
#thermocline
thermocline<-lab %>%
filter(chemical_name=='TEMPERATURE_DP_NA') %>%
mutate(thermocline=NA)
LID<-thermocline$LAKE_ID[1]
date<-thermocline$SAMPLE_DATE[1]
depth<-thermocline$profile_depth[1]
temp<-thermocline$result_value[1]
for(i in seq(nrow(thermocline))){
current<-thermocline[i,]
if(current$LAKE_ID==LID¤t$SAMPLE_DATE==date){
depth=current$profile_depth
if((temp-current$result_value)>1){
thermocline$thermocline[i]<-current$profile_depth
}
temp=current$result_value
}
else{
LID=current$LAKE_ID
date=current$SAMPLE_DATE
depth=current$profile_depth
temp=current$result_value
}
}
thermocline<-thermocline %>% #pull the lowest value for all
group_by(LAKE_ID,SAMPLE_DATE) %>%
mutate(thermocline=min(thermocline, na.rm=T)) %>%
ungroup() %>%
mutate(thermocline=ifelse(thermocline==Inf,NA,thermocline)) %>%
select(LAKE_ID,SAMPLE_DATE,thermocline,profile_depth) %>%
filter(!is.na(thermocline)) %>% #remove those without a thermocline
distinct()
#epilimentic means for ph and spconductance
epi<-lab %>%
inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>%
filter(chemical_name %in% c("PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA")) %>%
mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>%
distinct() %>%
arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth)
epi<-epi %>%
filter(profile_depth<=thermocline) %>%
distinct() %>%
group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>%
summarize(Mean=mean(result_value,na.rm=TRUE),
n=n()) %>%
filter(n>2) %>%
select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)
epi<-epi %>%
mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>%
rename(result_value=Mean) %>%
mutate(chemical_name=case_when(
chemical_name=="PH_DP_NA"~"PH_epi",
chemical_name=="SPECIFIC CONDUCTANCE_DP_NA"~"SPECIFIC CONDUCTANCE_epi"
))
#hypolimnetic means for ph and dissolved oxygen
hypo<-lab %>%
inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>%
filter(chemical_name %in% c("PH_DP_NA","DISSOLVED OXYGEN_DP_NA")) %>%
mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>%
distinct() %>%
arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth)
hypo<-hypo %>%
filter(profile_depth>=thermocline) %>%
distinct() %>%
group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>%
summarize(Mean=mean(result_value,na.rm=TRUE),
n=n()) %>%
filter(n>2) %>%
select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)
hypo<-hypo %>%
mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>%
rename(result_value=Mean) %>%
mutate(chemical_name=case_when(
chemical_name=="PH_DP_NA"~"PH_hypo",
chemical_name=="DISSOLVED OXYGEN_DP_NA"~"DISSOLVED OXYGEN_hypo"
))
# read in site data
sites<-read.csv("Probability_Based_Sites_2020_2021.csv")
sites<-sites %>%
rename(siteID=SITE_ID,
xcoord=LON_DD83,
ycoord=LAT_DD83,
Eval_Status=EvalStatus) %>%
filter(Eval_Status!="") %>% #removing sites that we haven't yet evaluated
mutate(Eval_Status=trimws(Eval_Status))
# restrict to only the data in the probability study and spread the data
att<-merge(lab,epi,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>%
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,hypo,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>%
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,DO,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE)%>%
mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>%
filter(!chemical_name %in% c("DISSOLVED OXYGEN_DP_NA","PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA","TEMPERATURE_DP_NA")) %>%
select(-result_value.x,-result_value.y)
att<-merge(att,sites,by=c('LAKE_ID'),all.y = TRUE) %>% distinct()
att<-att %>% spread(chemical_name,result_value,fill = NA)
att<-att %>% filter((Eval_Status=="Target_Sampled"&!is.na(`CHLOROPHYLL A_OW_TOTAL`)&!is.na(`PHOSPHORUS, TOTAL_OW_TOTAL`)) |
(Eval_Status!="Target_Sampled") |
(LAKE_ID %in% c("0703UWBXXX1","0801KAY0984A","1203MET0821","0602LUD0099","0801GUL0969")))
#creating thresholds
att<-att %>%
# remove fields in the sites table that aren't relevant
select(-Accessible,-Comments,-Contact,-STATUS) %>%
# create total nitrogen
mutate(`NITROGEN, TOTAL`=case_when(
!is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~
(`NITROGEN, NITRATE-NITRITE_OW_TOTAL`+`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`),
is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~ `NITROGEN, TOTAL_OW_TOTAL`)) %>%
# DIN:TP
mutate(DINTP=`NITROGEN, NITRATE (AS N)_OW_TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`) %>%
# trophic status
mutate(phos_trophic=case_when(
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.01 ~ "Oligotrophic",
between(`PHOSPHORUS, TOTAL_OW_TOTAL`,0.01,0.02) ~ "Mesotrophic",
`PHOSPHORUS, TOTAL_OW_TOTAL`>=0.02 ~ "Eutrophic")) %>%
mutate(chla_trophic=case_when(
`CHLOROPHYLL A_OW_TOTAL`<=2 ~ "Oligotrophic",
between(`CHLOROPHYLL A_OW_TOTAL`,2,8) ~ "Mesotrophic",
`CHLOROPHYLL A_OW_TOTAL`>=8 ~ "Eutrophic")) %>%
# EPA thresholds
mutate(TP_threshold=case_when(
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.016 ~ "Good",
between(`PHOSPHORUS, TOTAL_OW_TOTAL`, 0.016, 0.0279) ~ "Fair",
`PHOSPHORUS, TOTAL_OW_TOTAL`>=0.0279 ~ "Poor")) %>%
mutate(TN_threshold=case_when(
`NITROGEN, TOTAL`<=0.428 ~ "Good",
between(`NITROGEN, TOTAL`, 0.428, 0.655) ~ "Fair",
`NITROGEN, TOTAL`>=0.655 ~ "Poor")) %>%
mutate(CHLA_threshold=case_when(
`CHLOROPHYLL A_OW_TOTAL`<=4.52 ~ "Good",
between(`CHLOROPHYLL A_OW_TOTAL`, 4.52, 8.43) ~ "Fair",
`CHLOROPHYLL A_OW_TOTAL`>=8.43 ~ "Poor")) %>%
# microcystin
mutate(microcystin=case_when(
is.na(`MICROCYSTIN_OW_NA`) ~"Non-detect",
`MICROCYSTIN_OW_NA`<8 ~ "Microcystin Detected",
`MICROCYSTIN_OW_NA`>=8 ~ "Most disturbed")) %>%
# dissolved oxygen
mutate(d.oxygen=case_when(
`DISSOLVED OXYGEN_epi`<=3 ~ "Poor",
between(`DISSOLVED OXYGEN_epi`, 3, 5) ~ "Fair",
`DISSOLVED OXYGEN_epi`>=5 ~ "Good")) %>%
#nutrient limitation
mutate(N_LIMIT=case_when(
`NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`<=10 ~ "N-limited",
between(`NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`, 10, 20) ~ "Co-limited",
`NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`>=20 ~ "P-limited")) %>%
#secchi
mutate(secchi=case_when(
`DEPTH, SECCHI DISK DEPTH_SD_NA`<=2 ~ "Eutrophic",
between(`DEPTH, SECCHI DISK DEPTH_SD_NA`, 2, 5) ~ "Mesotrophic",
`DEPTH, SECCHI DISK DEPTH_SD_NA`>=5 ~ "Oligotrophic")) %>%
#zebra mussels
mutate(zebra=case_when(
CALCIUM_OW_TOTAL<=10 ~ "Not susceptible",
between(CALCIUM_OW_TOTAL, 10, 20) ~ "May be susceptible",
CALCIUM_OW_TOTAL>=20 ~ "Highly susceptible")) %>%
#cslap
mutate(conductance=case_when(
`SPECIFIC CONDUCTANCE_epi`<=125 ~ "Soft",
between(`SPECIFIC CONDUCTANCE_epi`, 125, 250) ~ "Average",
`SPECIFIC CONDUCTANCE_epi`>=250 ~ "Hard")) %>%
mutate(color=case_when(
`TRUE COLOR_OW_TOTAL`<=10 ~ "Uncolored",
between(`TRUE COLOR_OW_TOTAL`, 10, 30) ~ "Weak",
`TRUE COLOR_OW_TOTAL`>=30 ~ "High")) %>%
mutate(ph=case_when(
`PH_epi`<=6.5 ~ "Acidic",
between(`PH_epi`, 6.5, 7.5) ~ "~Neutral",
between(`PH_epi`, 7.5, 8.5) ~ "Slightly alk",
`PH_epi`>=8.5 ~ "Highly alk")) %>%
#leech
mutate(leech=case_when(
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.030 & `TRUE COLOR_OW_TOTAL`<=20 ~ "Blue",
`PHOSPHORUS, TOTAL_OW_TOTAL`>0.030 & `TRUE COLOR_OW_TOTAL`<=20 ~ "Green",
`PHOSPHORUS, TOTAL_OW_TOTAL`<=0.030 & `TRUE COLOR_OW_TOTAL`>20 ~ "Brown",
`PHOSPHORUS, TOTAL_OW_TOTAL`>0.030 & `TRUE COLOR_OW_TOTAL`>20 ~ "Murky"
)) %>%
#chloride
mutate(chloride=case_when(
`CHLORIDE_OW_TOTAL` <= 35 ~ "Low",
between(`CHLORIDE_OW_TOTAL`,35,250) ~ "Medium",
`CHLORIDE_OW_TOTAL` >= 250 ~ "High",
Eval_Status=="Target_Sampled" & is.na(`CHLORIDE_OW_TOTAL`) ~ "Low")) %>%
#chlorophyll
rename(CHLOROPHYTE=`CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA`,
CRYPTOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA`,
CYANOBACTERIA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA`,
DINOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA`,
PROBE_TOTAL=`CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA`) %>%
mutate(chlorophyte=case_when(
CHLOROPHYTE/PROBE_TOTAL<=0.25 ~ "Low",
between(CHLOROPHYTE/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CHLOROPHYTE/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(cryptophyta=case_when(
CRYPTOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
between(CRYPTOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CRYPTOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(cyanobacteria=case_when(
CYANOBACTERIA/PROBE_TOTAL<=0.25 ~ "Low",
between(CYANOBACTERIA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
CYANOBACTERIA/PROBE_TOTAL>=0.5 ~ "High")) %>%
mutate(dinophyta=case_when(
DINOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
between(DINOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
DINOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>%
#alkalinity
mutate(alkalinity=case_when(
`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL` <= 60 ~ "Soft",
between(`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,60,120) ~ "Moderately hard",
between(`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,120,180) ~ "Hard",
`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL` >= 180 ~ "Very hard"))
# Create a Target/NonTarget status variable
att<-att %>%
mutate(statusTNT="Target",
statusTNT=ifelse(Eval_Status=="NonTarget","NonTarget",statusTNT))
# remove duplicates (multiple samples of same lake)
att <- att %>%
mutate(SAMPLE_DATE = ymd(SAMPLE_DATE)) %>% # convert to date
mutate(month = month(SAMPLE_DATE)) %>% # extract month from date
group_by(siteID) %>%
mutate(has_aug = 8 %in% month) %>% # annotate as having date in august
filter(has_aug & month == 8 |
!has_aug) %>% # filter only august dates from sites with or all from those without
slice_max(n = 1,
order_by = SAMPLE_DATE,
with_ties = F) %>% # change with_ties to TRUE to discard NA values
select(-has_aug)
att<-att %>%
ungroup() %>%
mutate(WgtAdj=case_when(
PROB_CAT=="(1,4]"~(1828/29),
PROB_CAT=="(4,10]"~(2490/15),
PROB_CAT=="(10,20]"~(1003/18),
PROB_CAT=="(20,50]"~(616/20),
PROB_CAT==">50"~(500/33)))
#read in data on excursions
excursions <- read.csv("probability.data.excursions.csv")
excursions <- excursions %>%
select(parameter,n_exceedances,LAKE_ID) %>%
spread(parameter,n_exceedances,fill = 0) %>%
mutate(LAKE_ID=toupper(LAKE_ID)) %>%
mutate(all_params=case_when(
ammonia==0 & chlorophyll_a==0 & dissolved_oxygen==0 & nitrite==0 & ph==0 & phosphorus==0 ~0,
LAKE_ID=TRUE~1
)) %>%
rename(AMMONIA_excursions=ammonia,
DO_excursions=dissolved_oxygen,
NITRITE_excursions=nitrite,
PH_excursions=ph,
PHOSPHORUS_excursions=phosphorus,
Fully_support=all_params)
att<-merge(att,excursions,by="LAKE_ID")
Additionally, I will load in NLA, NAP and NH data:
# read in NLA data
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")
chem <- read.csv("nla_2017_water_chemistry_chla-data.csv")
chem <- chem %>%
filter(ANALYTE %in% c("NTL","CHLA","PTL","NITRATE_N","COLOR","CHLORIDE","MAGNESIUM","DOC","COND","AMMONIA_N","CALCIUM","PH"),
NARS_FLAG == "") %>%
select(UID,SITE_ID,DATE_COL,ANALYTE,RESULT) %>%
pivot_wider(names_from= ANALYTE ,values_from = RESULT,) %>%
rename(TN=NTL,TP=PTL)
secchi <- read.csv("nla_2017_secchi-data.csv") %>%
mutate(SECCHI=(DISAPPEARS+REAPPEARS)/2,
DATE_COL=mdy(DATE_COL)) %>%
select(UID,SITE_ID,DATE_COL,SECCHI)
oxygen <- read.csv("nla_2017_profile-data.csv")
oxygen <- oxygen %>%
select(UID,SITE_ID,DATE_COL,DEPTH,OXYGEN) %>%
# dissolved oxygen:
# keep DO values between 0 and 2m depth
mutate(keep=case_when(
DEPTH <= 2 ~ "yes",
DEPTH > 2 ~ "no")) %>%
filter(keep=="yes") %>%
select(-keep)
# mean DO for each lake
DO <- aggregate(oxygen$OXYGEN,list(oxygen$SITE_ID,oxygen$DATE_COL),FUN=mean)
# add back in
oxygen <- merge(oxygen,DO,by.x=c("SITE_ID","DATE_COL"),by.y=c("Group.1","Group.2"),all.x=TRUE)
oxygen$x <- as.numeric(oxygen$x)
oxygen <- oxygen %>%
select(-c(DEPTH,OXYGEN)) %>%
rename(OXYGEN=x) %>%
distinct()
att.nla <- merge(chem,oxygen,by=c("UID","SITE_ID","DATE_COL"),all.y=TRUE,all.x=TRUE) %>%
mutate(DATE_COL=dmy(DATE_COL))
att.nla <- merge(att.nla,secchi,by=c("UID","SITE_ID","DATE_COL"),all.x=TRUE)
att.nla[,4:17] <- lapply(att.nla[,4:17],as.numeric)
att.nla$DINTP <- as.numeric(att.nla$NITRATE_N)/(as.numeric(att.nla$TP)/1000)
# selecting parameters and adding trophic status
att.nla<-att.nla %>%
# trophic status
mutate(phos_trophic=case_when(
TP<=10 ~ "oligotrophic",
between(TP,10,20) ~ "mesotrophic",
TP>=20 ~ "eutrophic")) %>%
mutate(chla_trophic=case_when(
CHLA<=2 ~ "oligotrophic",
between(CHLA,2,8) ~ "mesotrophic",
CHLA>=8 ~ "eutrophic")) %>%
# EPA thresholds
mutate(TP_threshold=case_when(
TP<=16 ~ "Good",
between(TP, 16, 27.9) ~ "Fair",
TP>=27.9 ~ "Poor")) %>%
mutate(TN_threshold=case_when(
TN<=0.428 ~ "Good",
between(TN, 0.428, 0.655) ~ "Fair",
TN>=0.655 ~ "Poor")) %>%
mutate(CHLA_threshold=case_when(
CHLA<=4.52 ~ "Good",
between(CHLA, 4.52, 8.43) ~ "Fair",
CHLA>=8.43 ~ "Poor")) %>%
# dissolved oxygen
mutate(d.oxygen=case_when(
OXYGEN<=3 ~ "Poor",
between(OXYGEN, 3, 5) ~ "Fair",
OXYGEN>=5 ~ "Good")) %>%
#leech
mutate(leech=case_when(
TP<=30 & COLOR<=20 ~ "Blue",
TP>30 & COLOR<=20 ~ "Green",
TP<=30 & COLOR>20 ~ "Brown",
TP>30 & COLOR>20 ~ "Murky"
)) %>%
#chloride
mutate(chloride=case_when(
CHLORIDE <= 35 ~ "Low",
between(CHLORIDE,35,250) ~ "Medium",
CHLORIDE >= 250 ~ "High"))
sites <- read.csv("nla_2017_site_information-data.csv")
nap.sites <- sites %>%
mutate(include=case_when(
AG_ECO9=="NAP" & AREA_HA>2.63 ~ "yes",
SITE_ID=TRUE ~ "no"
)) %>%
select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>%
filter(WGT_TP_EXTENT != 0)
nla.sites <- sites %>%
mutate(include=case_when(
AREA_HA>2.63 ~ "yes",
SITE_ID=TRUE ~ "no"
)) %>%
select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>%
filter(WGT_TP_EXTENT != 0)
nla.att <- merge(att.nla,nla.sites,by=c("UID","SITE_ID"))
nap.att <- merge(att.nla,nap.sites,by=c("UID","SITE_ID"))
nh.att <- read.csv("New_Hampshire_Lakes_2017_Design_Status_20211027_KH_EPA.csv", na.strings=c(""," ","NA"))
# selecting parameters and adding trophic status
nh.att<-nh.att %>%
mutate(include=case_when(
AREA_HA>2.63 ~ "yes",
SITE_ID=TRUE ~ "no"))
chem <- read.csv("BasicNHChem_ForNY.csv")
#merge
nh.att <- merge(nh.att,chem,by.x="SITE_ID",by.y="NLA.ID",all.x=T) %>%
mutate(CHLORIDE=case_when(
CHLORIDE!="<3" ~ as.numeric(CHLORIDE),
CHLORIDE=="<3" ~ 0)) %>%
mutate(CHLORIDE_threshold=case_when(
CHLORIDE<=35 ~ "Low",
between(CHLORIDE,35,250) ~ "Medium",
CHLORIDE>=250 ~ "High"
))
Here I perform the analysis using cat_analysis(). The relevant categorical variable / threshold should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe.
In the future, the weight argument should be part of the site design and might not be called WgtAdj.
#list variables you are interested in and defined above
vars <- c("phos_trophic", "chla_trophic", "TP_threshold", "TN_threshold", "CHLA_threshold", "microcystin", "d.oxygen", "N_LIMIT", "secchi", "zebra", "conductance", "color", "ph", "leech", "chloride","alkalinity","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
#analysis
CatExtent <- cat_analysis(
dframe=att,
vars=vars,
subpops = , #for example could separate by area category or year
siteID = "siteID",
weight = "WgtAdj", #name will probably change in future
xcoord = "xcoord",
ycoord = "ycoord")
table <- CatExtent %>%
select(Indicator,
Category,
Estimate.P,
LCB95Pct.P,
UCB95Pct.P) %>%
filter(Category!="Total") %>%
mutate(Category=factor(Category, levels=c("Poor","Fair","Good",
"Blue","Green","Brown","Murky",
"High","Weak","Uncolored",
"Medium","Low",
"Acidic","~Neutral","Slightly alk","Highly alk",
"Soft","Moderately hard","Average","Hard","Very hard",
"Highly susceptible","May be susceptible","Not susceptible",
"Eutrophic","Mesotrophic","Oligotrophic",
"N-limited","Co-limited","P-limited",
"Non-detect","Microcystin detected","Most disturbed",
"Excursion","Non-excursion")),
Indicator=case_when(
Indicator=="phos_trophic"~"Phosphorus",
Indicator=="chla_trophic"~"Chlorophyll-a",
Indicator=="TP_threshold"~"Total phosphorus",
Indicator=="TN_threshold"~"Total nitrogen",
Indicator=="CHLA_threshold"~"Total chlorophyll",
Indicator=="microcystin"~"Microcystin",
Indicator=="d.oxygen"~"Dissolved oxygen",
Indicator=="N_LIMIT"~"Nutrient limitation",
Indicator=="secchi"~"Secchi",
Indicator=="zebra"~"Zebra mussel susceptibility",
Indicator=="conductance"~"Hardness",
Indicator=="color"~"Color",
Indicator=="ph"~"pH",
Indicator=="leech"~"Nutrient-color status",
Indicator=="chloride"~"Chloride",
Indicator=TRUE~Indicator)
)
hux <- as_hux(table) #I use huxtable because kable doesn't work on my computer but that is also fine
number_format(hux) <- 2
theme_plain(hux)
| Indicator | Category | Estimate.P | LCB95.00Pct.P | UCB95.00Pct.P |
|---|---|---|---|---|
| Phosphorus | Eutrophic | 22.35 | 11.25 | 33.45 |
| Phosphorus | Mesotrophic | 34.33 | 18.35 | 50.31 |
| Phosphorus | Oligotrophic | 43.32 | 28.92 | 57.72 |
| Chlorophyll-a | Eutrophic | 11.43 | 4.97 | 17.89 |
| Chlorophyll-a | Mesotrophic | 69.33 | 56.12 | 82.53 |
| Chlorophyll-a | Oligotrophic | 19.24 | 7.08 | 31.41 |
| Total phosphorus | Fair | 18.31 | 6.53 | 30.09 |
| Total phosphorus | Good | 64.89 | 51.85 | 77.92 |
| Total phosphorus | Poor | 16.81 | 6.06 | 27.56 |
| Total nitrogen | Fair | 14.26 | 5.66 | 22.86 |
| Total nitrogen | Good | 66.79 | 52.56 | 81.02 |
| Total nitrogen | Poor | 18.95 | 5.88 | 32.02 |
| Total chlorophyll | Fair | 34.09 | 17.76 | 50.41 |
| Total chlorophyll | Good | 54.49 | 38.33 | 70.64 |
| Total chlorophyll | Poor | 11.43 | 4.97 | 17.89 |
| Microcystin | Non-detect | 100.00 | 100.00 | 100.00 |
| Dissolved oxygen | Fair | 1.28 | 0.00 | 3.46 |
| Dissolved oxygen | Good | 91.16 | 80.11 | 100.00 |
| Dissolved oxygen | Poor | 7.55 | 0.00 | 18.47 |
| Nutrient limitation | Co-limited | 17.32 | 3.51 | 31.12 |
| Nutrient limitation | N-limited | 0.78 | 0.00 | 2.13 |
| Nutrient limitation | P-limited | 81.90 | 68.01 | 95.80 |
| Secchi | Eutrophic | 46.37 | 31.37 | 61.37 |
| Secchi | Mesotrophic | 48.69 | 33.50 | 63.88 |
| Secchi | Oligotrophic | 4.94 | 0.80 | 9.09 |
| Zebra mussel susceptibility | Highly susceptible | 29.72 | 12.54 | 46.90 |
| Zebra mussel susceptibility | May be susceptible | 14.87 | 0.00 | 31.71 |
| Zebra mussel susceptibility | Not susceptible | 55.41 | 34.80 | 76.02 |
| Hardness | Average | 16.41 | 2.38 | 30.45 |
| Hardness | Hard | 23.19 | 7.91 | 38.48 |
| Hardness | Soft | 60.39 | 44.37 | 76.42 |
| Color | High | 68.64 | 52.20 | 85.07 |
| Color | Uncolored | 8.28 | 0.14 | 16.41 |
| Color | Weak | 23.09 | 10.25 | 35.92 |
| pH | ~Neutral | 47.34 | 30.03 | 64.65 |
| pH | Acidic | 15.26 | 4.42 | 26.10 |
| pH | Highly alk | 15.28 | 1.49 | 29.08 |
| pH | Slightly alk | 22.12 | 10.28 | 33.95 |
| Nutrient-color status | Blue | 21.59 | 7.15 | 36.04 |
| Nutrient-color status | Brown | 54.50 | 34.94 | 74.06 |
| Nutrient-color status | Green | 1.08 | 0.00 | 3.07 |
| Nutrient-color status | Murky | 22.83 | 6.76 | 38.90 |
| Chloride | High | 6.60 | 0.00 | 16.90 |
| Chloride | Low | 77.19 | 64.37 | 90.01 |
| Chloride | Medium | 16.21 | 5.60 | 26.81 |
| alkalinity | Hard | 11.47 | 0.18 | 22.75 |
| alkalinity | Moderately hard | 20.63 | 7.24 | 34.03 |
| alkalinity | Soft | 67.90 | 54.43 | 81.37 |
| chlorophyte | High | 75.45 | 58.60 | 92.30 |
| chlorophyte | Low | 13.42 | 0.00 | 28.43 |
| chlorophyte | Medium | 11.13 | 1.60 | 20.66 |
| cryptophyta | Low | 100.00 | 100.00 | 100.00 |
| cyanobacteria | High | 16.19 | 0.32 | 32.07 |
| cyanobacteria | Low | 65.93 | 47.82 | 84.03 |
| cyanobacteria | Medium | 17.88 | 6.75 | 29.00 |
| dinophyta | High | 2.11 | 0.00 | 5.90 |
| dinophyta | Low | 87.84 | 78.38 | 97.29 |
| dinophyta | Medium | 10.06 | 1.51 | 18.61 |
ny.cat <- table %>% filter(Indicator %in% c("Total phosphorus","Total nitrogen","Total chlorophyll","Dissolved oxygen","Chloride")) %>% mutate(Study="NY")
probability.cat <- table %>% filter(Indicator %in% c("Phosphorus","Chlorophyll-a","Secchi"))
Here I perform the analysis using cont_analysis(). The relevant CONTINUOUS variable should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe with all. The output of this function is a list with 3 estimations: cumulative distribution function (CDF), percentiles, and means.
As above, in the future the weight argument should be part of the site design and might not be called WgtAdj.
#To conduct analysis on a continuous variable, using a new list of CONTINUOUS variables, also note that cont_analysis() doesn't like variable names that have spaces so you will need to rename these. It's helpful to put the units in the name.
att <- att %>% rename("CHLOROPHYLL_ug_L"=`CHLOROPHYLL A_OW_TOTAL`,
"NITROGEN_mg_L"=`NITROGEN, TOTAL`,
"PHOSPHORUS_mg_L"=`PHOSPHORUS, TOTAL_OW_TOTAL`,
"DISSOLVED_OXYGEN_mg_L"=`DISSOLVED OXYGEN_hypo`,
"ALKALINITY_mg_L"=`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,
"CHLORIDE_mg_L"=CHLORIDE_OW_TOTAL,
"CALCIUM_mg_L"=CALCIUM_OW_TOTAL,
"DOC_mg_L"=`CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED`,
"SECCHI_m"=`DEPTH, SECCHI DISK DEPTH_SD_NA`,
"TRUE_COLOR"=`TRUE COLOR_OW_TOTAL`,
"SPEC_CONDUCTANCE"=`SPECIFIC CONDUCTANCE_epi`,
"AMMONIA_N"=`NITROGEN, AMMONIA (AS N)_OW_TOTAL`,
"NP_ratio"=DINTP)
myvars <- c("CHLOROPHYLL_ug_L","NITROGEN_mg_L","PHOSPHORUS_mg_L","CHLORIDE_mg_L","DISSOLVED_OXYGEN_mg_L","DOC_mg_L","CALCIUM_mg_L","SECCHI_m","TRUE_COLOR","SPEC_CONDUCTANCE","AMMONIA_N","MAGNESIUM_OW_TOTAL","PH_hypo","ARSENIC_BS_TOTAL", "ARSENIC_OW_TOTAL", "IRON_BS_TOTAL", "IRON_OW_TOTAL", "MAGNESIUM_BS_TOTAL", "MAGNESIUM_OW_TOTAL","ALKALINITY_mg_L","NP_ratio")
#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
dframe = att,
vars = myvars,
subpops = ,
siteID = "siteID",
weight = "WgtAdj",
xcoord = "xcoord",
ycoord = "ycoord")
NY<-analysis$CDF %>%
select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>%
mutate(Study="NY") %>%
mutate(Value=case_when(
Indicator=="PHOSPHORUS_mg_L"~as.numeric(Value)*1000,
Indicator=TRUE~as.numeric(Value)
)) %>%
mutate(Indicator=case_when(
Indicator=="PHOSPHORUS_mg_L"~"PHOSPHORUS_ug_L",
Indicator=TRUE~Indicator
))
str(analysis)
## List of 3
## $ CDF :'data.frame': 659 obs. of 15 variables:
## ..$ Type : chr [1:659] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation : chr [1:659] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:659] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
## ..$ Value : num [1:659] 1.00e-10 4.45e-01 5.50e-01 6.84e-01 1.05 ...
## ..$ nResp : num [1:659] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ Estimate.P : num [1:659] 1.38 2.77 3.45 4.13 4.81 ...
## ..$ StdError.P : num [1:659] 1.22 1.7 1.82 1.95 2 ...
## ..$ MarginofError.P: num [1:659] 2.39 3.32 3.56 3.81 3.93 ...
## ..$ LCB95Pct.P : num [1:659] 0 0 0 0.318 0.886 ...
## ..$ UCB95Pct.P : num [1:659] 3.77 6.09 7.01 7.95 8.74 ...
## ..$ Estimate.U : num [1:659] 30.8 61.6 76.8 91.9 107.1 ...
## ..$ StdError.U : num [1:659] 27 37.4 39.6 41.9 42.6 ...
## ..$ MarginofError.U: num [1:659] 52.9 73.4 77.5 82.1 83.5 ...
## ..$ LCB95Pct.U : num [1:659] 0 0 0 9.76 23.54 ...
## ..$ UCB95Pct.U : num [1:659] 83.7 135 154.3 174 190.6 ...
## $ Pct :'data.frame': 147 obs. of 10 variables:
## ..$ Type : chr [1:147] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation: chr [1:147] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:147] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
## ..$ Statistic : chr [1:147] "5Pct" "10Pct" "25Pct" "50Pct" ...
## ..$ nResp : num [1:147] 5 9 15 26 33 45 48 4 4 10 ...
## ..$ Estimate : num [1:147] 1.08 1.08 1.08 1.08 1.08 ...
## ..$ StdError : num [1:147] 0.411 0.357 0.433 0.586 1.209 ...
## ..$ MarginofError: num [1:147] 0.805 0.7 0.848 1.148 2.369 ...
## ..$ LCB95Pct : num [1:147] 0 0.513 1.564 2.542 4.584 ...
## ..$ UCB95Pct : num [1:147] 1.65 1.94 3.3 4.89 9.43 ...
## $ Mean:'data.frame': 21 obs. of 9 variables:
## ..$ Type : chr [1:21] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
## ..$ Subpopulation: chr [1:21] "All Sites" "All Sites" "All Sites" "All Sites" ...
## ..$ Indicator : chr [1:21] "CHLOROPHYLL_ug_L" "NITROGEN_mg_L" "PHOSPHORUS_mg_L" "CHLORIDE_mg_L" ...
## ..$ nResp : int [1:21] 55 47 60 55 33 41 33 59 35 38 ...
## ..$ Estimate : num [1:21] 5.0524 0.5559 0.0186 44.8451 4.8785 ...
## ..$ StdError : num [1:21] 0.45911 0.08874 0.00235 18.67918 0.74625 ...
## ..$ MarginofError: num [1:21] 0.8998 0.1739 0.0046 36.6105 1.4626 ...
## ..$ LCB95Pct : num [1:21] 4.153 0.382 0.014 8.235 3.416 ...
## ..$ UCB95Pct : num [1:21] 5.9522 0.7298 0.0232 81.4556 6.3411 ...
Here I have plotted the analyses from above. The categorical variables are plotted as dots, maps and bars; the continuous variables are plotted as a distribution function. They are also grouped by QAPP parameter categories in the “Grouped plots” tab.
plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
geom_point()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
facet_wrap(.~Indicator,scales = "free")+
ylim(0,100)+
labs(title="Condition estimates across NYS Ponded Waters",y="Percent of Total",x="Condition category")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
#To plot a cumulative distribution function:
plot <- ggplot(analysis$CDF,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
geom_line()+
geom_point()+
geom_ribbon(alpha=0.5)+
ylim(0,100)+
facet_wrap(.~Indicator, scales="free")+
guides(fill=FALSE,shape=FALSE)+
labs(y="Percent of lakes")
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
#fetch map
library(ggmap)
library(maps)
states <- map_data("state")
ny <- subset(states, region %in% c("new york"))
att$CHLA_threshold <- factor(att$CHLA_threshold, levels=c("Good","Fair","Poor"))
ggplot() +
geom_polygon(data=ny,aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) +
geom_point(data=att,aes(x=xcoord,y=ycoord,color=CHLA_threshold))+
guides(fill=FALSE)+
theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
#list variables you are interested in and defined above
trophic <- c("phos_trophic","chla_trophic","N_LIMIT","secchi","leech","color")
minerals <- c("zebra","alkalinity")
in.situ <- c("ph","conductance")
habs <- c("microcystin","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
fishing <- c("AMMONIA_excursions","DO_excursions","NITRITE_excursions","PH_excursions","Fully_support")
vars.list <- list(Trophic=trophic,Minerals=minerals,`In Situ`=in.situ,HABs=habs,`Fishing use`=fishing)
n<-0
#analysis
for(i in vars.list){
n<-n+1
CatExtent <- cat_analysis(
dframe=att,
vars=i,
subpops = , #for example could separate by area category or year
siteID = "siteID",
weight = "WgtAdj", #name will probably change in future
xcoord = "xcoord",
ycoord = "ycoord")
table <- CatExtent %>%
select(Indicator,
Category,
Estimate.P,
LCB95Pct.P,
UCB95Pct.P) %>%
filter(Category!="Total") %>%
mutate(Category=case_when(
Category==0 ~ "Non-excursion",
Category==1 ~ "Excursion",
Category=TRUE~Category
)) %>%
mutate(Category=factor(Category, levels=c("Poor","Fair","Good",
"Blue","Green","Brown","Murky",
"High","Weak","Uncolored",
"Medium","Low",
"Acidic","~Neutral","Slightly alk","Highly alk",
"Soft","Moderately hard","Average","Hard","Very hard",
"Highly susceptible","May be susceptible","Not susceptible",
"Eutrophic","Mesotrophic","Oligotrophic",
"N-limited","Co-limited","P-limited",
"Non-detect","Microcystin detected","Most disturbed",
"Excursion","Non-excursion")),
Indicator=case_when(
Indicator=="phos_trophic"~"Phosphorus",
Indicator=="chla_trophic"~"Chlorophyll-a",
Indicator=="TP_threshold"~"Total phosphorus",
Indicator=="TN_threshold"~"Total nitrogen",
Indicator=="CHLA_threshold"~"Total chlorophyll",
Indicator=="microcystin"~"Microcystin",
Indicator=="d.oxygen"~"Dissolved oxygen",
Indicator=="N_LIMIT"~"Nutrient limitation",
Indicator=="secchi"~"Secchi",
Indicator=="zebra"~"Zebra mussel susceptibility",
Indicator=="conductance"~"Conductance",
Indicator=="color"~"Color",
Indicator=="ph"~"pH",
Indicator=="leech"~"Nutrient-color status",
Indicator=='alkalinity'~"Alkalinity",
Indicator=="chlorophyte"~"% Chlorophyte",
Indicator=="cryptophyta"~"% Cryptophyta",
Indicator=="cyanobacteria"~"% Cyanobacteria",
Indicator=="dinophyta"~"% Dinophyta",
Indicator=="AMMONIA_excursions"~"Ammonia",
Indicator=="DO_excursions"~"Dissolved oxygen",
Indicator=="NITRITE_excursions"~"Nitrite",
Indicator=="PH_excursions"~"pH",
Indicator=="Fully_support"~"All parameters",
Indicator=TRUE~Indicator)
)
if(names(vars.list)[n]=="Fishing use"){
plot1<-ggplot(table[table$Indicator=="All parameters",],aes(x=Category,y=Estimate.P)) +
geom_col()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
ylim(0,100)+
labs(y="Percent of Total",x="Fully supporting",title="Fishing Use Parameters")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
plot2<-ggplot(table[table$Indicator!="All parameters",],aes(x=Indicator,y=Estimate.P,fill=Category)) +
geom_bar(position="stack", stat="identity")+
ylim(0,100)+
labs(y="",x="Parameter")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))+
scale_fill_grey()
plot1 <- set_panel_size(p = plot1, width = unit(4, "cm"), height = unit(4,"cm"))
plot2 <- set_panel_size(p = plot2, width = unit(6, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot1,plot2,ncol=2)
}else{
plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
geom_col()+
geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
theme(legend.position = "none")+
facet_wrap(.~Indicator,scales = "free")+
ylim(0,100)+
labs(y="Percent of Total",x="Condition category",title=paste(names(vars.list)[n],"Parameters"))+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
}
#list variables you are interested in and defined above
trophic <- c("CHLOROPHYLL_ug_L",
"NITROGEN_mg_L",
"PHOSPHORUS_mg_L",
"SECCHI_m",
"TRUE_COLOR",
"NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
"PH_hypo",
"SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL")
habs <- c("CYANOBACTERIA")
vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals,HABs=habs)
n<-0
for(i in vars.list){
n<-n+1
#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
dframe = att,
vars = i,
subpops = ,
siteID = "siteID",
weight = "WgtAdj",
xcoord = "xcoord",
ycoord = "ycoord")
table<-analysis$CDF %>%
select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>%
filter(Indicator %in% i) %>%
mutate(Indicator=case_when(
Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
Indicator=="PHOSPHORUS_mg_L"~"Total phosphorus (mg/L)",
Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
Indicator=="SECCHI_m"~"Secchi disk depth (m)",
Indicator=="TRUE_COLOR"~"True color (PCU)",
Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
Indicator=="PH_hypo"~"pH (SU)",
Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
Indicator=="NP_ratio"~"DIN:TP ratio",
Indicator=="CYANOBACTERIA"~"% Cyanobacteria",
Indicator=TRUE~Indicator
)) %>%
mutate(Lower=case_when(
Indicator=="Chlorophyll-a (ug/L)"~2,
Indicator=="Total phosphorus (mg/L)"~0.01,
Indicator=="Specific conductance (mS/cm)"~125,
Indicator=="Alkalinity (mg/L)"~60,
Indicator=="Calcium (mg/L)"~10,
Indicator=="True color (PCU)"~10,
Indicator=="Secchi disk depth (m)"~2,
Indicator=TRUE~as.numeric(""))) %>%
mutate(Upper=case_when(
Indicator=="Chlorophyll-a (ug/L)"~8,
Indicator=="Total phosphorus (mg/L)"~0.02,
Indicator=="Specific conductance (mS/cm)"~250,
Indicator=="Alkalinity (mg/L)"~120,
Indicator=="Calcium (mg/L)"~20,
Indicator=="True color (PCU)"~20,
Indicator=="Secchi disk depth (m)"~5,
Indicator=TRUE~as.numeric("")))
plot<-ggplot(table,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
geom_line()+
geom_point()+
geom_ribbon(alpha=0.5)+
ylim(0,100)+
guides(fill="none",shape="none")+
facet_wrap(.~Indicator, scales="free")+
geom_vline(aes(xintercept=Lower),linetype="dashed")+
geom_vline(aes(xintercept=Upper),linetype="dashed")+
labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))+
scale_color_grey()
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
Here I make the comparisons to NLA and NH data from above, as well as to historical LMAS sampling. Thresholds for NLA, NH and NY comparisons are from the NLA and are “good”, “fair”, and “poor” for chlorophyll-a, phosphorus, nitrogen and epilimnetic dissolved oxygen. There are no thresholds for chloride.
The values for LMAS sampling are derived from a separate script that counts the number of lakes in each size class from samples in the last 10 years.
## During execution of the program, 3 warning messages were generated. The warning
## messages are stored in a data frame named 'warn_df'. Enter the following
## command to view the warning messages: warnprnt()
## To view a subset of the warning messages (say, messages number 1, 3, and 5),
## enter the following command: warnprnt(m=c(1,3,5))
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
## During execution of the program, 10 warning messages were generated. The warning
## messages are stored in a data frame named 'warn_df'. Enter the following
## command to view the warning messages: warnprnt()
## To view a subset of the warning messages (say, messages number 1, 3, and 5),
## enter the following command: warnprnt(m=c(1,3,5))
cats <- rbind(ny.cat,nh.cat,nla.cat,nap.cat)
plot <- ggplot(cats,aes(x=Category,y=Estimate.P,color=Study,shape=Study))+
geom_point(position=position_dodge(width=0.2))+
ylim(0,100)+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=.25, position=position_dodge(width=0.2))+
facet_wrap(.~Indicator, scales="free")+
labs(title="Condition category extent estimates")
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
## During execution of the program, a warning message was generated. The warning
## message is stored in a data frame named 'warn_df'. Enter the following command
## to view the warning message: warnprnt()
#list variables you are interested in and defined above
trophic <- c("CHLOROPHYLL_ug_L",
"NITROGEN_mg_L",
"PHOSPHORUS_ug_L",
"SECCHI_m",
"TRUE_COLOR",
"NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
"PH_hypo",
"SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
"ARSENIC_OW_TOTAL",
"IRON_BS_TOTAL",
"IRON_OW_TOTAL",
"MAGNESIUM_BS_TOTAL",
"MAGNESIUM_OW_TOTAL")
vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals)
n<-0
for(i in vars.list){
n<-n+1
table<-all %>%
filter(Indicator %in% i) %>%
mutate(Indicator=case_when(
Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
Indicator=="PHOSPHORUS_ug_L"~"Total phosphorus (ug/L)",
Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
Indicator=="SECCHI_m"~"Secchi disk depth (m)",
Indicator=="TRUE_COLOR"~"True color (PCU)",
Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
Indicator=="PH_hypo"~"pH (SU)",
Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
Indicator=="DOC_mg_L"~"Dissolved organic carbon (mg/L)",
Indicator=="NP_ratio"~"DIN:TP ratio",
Indicator=TRUE~Indicator
)) %>%
filter((Indicator=="Chlorophyll-a (ug/L)"&Value<50)|
(Indicator=="Chloride (mg/L)"&Value<250)|
(Indicator=="Total nitrogen (mg/L)"&Value<2.5)|
(Indicator=="Total phosphorus (ug/L)"&Value<150)|
(Indicator=="Dissolved organic carbon (mg/L)"&Value<50)|
(Indicator=="Calcium (mg/L)"&Value<50)|
(Indicator=="Conductance (mS/cm)" & Value<1000)|
(Indicator=="Magnesium, open water (ug/L)"&Value<25000)|
!(Indicator %in% c("Chlorophyll-a (ug/L)","Chloride (mg/L)","Total nitrogen (mg/L)", "Total phosphorus (ug/L)","Dissolved organic carbon (mg/L)","Calcium (mg/L)","Conductance (mS/cm)","Magnesium, open water (ug/L)"))) %>%
mutate(Study=factor(Study, levels=c("NY","NLA","NAP","NH")))
plot <- ggplot(table,aes(x=Value,y=Estimate.P,color=Study,shape=Study,linetype=Study))+
geom_line()+
ylim(0,100)+
scale_linetype_manual(values=c("solid","dotdash", "dotted","dashed"))+
facet_wrap(.~Indicator, scales="free")+
labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))
plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot)
}
The plot below shows the distribution of the size classes and trophic classes included in the data frame. For comparison, I’ve plotted the proportion of NYS lakes sampled in each size class since 2010.
#Plotting data frame distribution as well as NYS sample distribution collected since 2010
#LMAS numbers retrieved in 2020
frame<-att %>%
select(PROB_CAT,AREA_CAT) %>%
rename(size_category=AREA_CAT) %>%
distinct() %>%
mutate(Probability_Sites = case_when( #statewide estimate
endsWith(size_category, "(1,4]") ~ ((1828/6437)*100),
endsWith(size_category, "(4,10]") ~ ((2490/6437)*100),
endsWith(size_category, "(10,20]") ~ ((1003/6437)*100),
endsWith(size_category, "(20,50]") ~ ((616/6437)*100),
endsWith(size_category, ">50") ~ ((500/6437)*100))) %>%
mutate(LMAS_Sites = case_when(
endsWith(size_category, "(1,4]") ~ (7.64),
endsWith(size_category, "(4,10]") ~ (16.7),
endsWith(size_category, "(10,20]") ~ (16.2),
endsWith(size_category, "(20,50]") ~ (19.3),
endsWith(size_category, ">50") ~ (40.1))) %>%
rename("Statewide"=Probability_Sites,
"LMAS"=LMAS_Sites) %>%
ungroup() %>%
select(-PROB_CAT) %>%
gather(study,percentage,-size_category) %>%
arrange(study,size_category) %>%
mutate(percentage=as.numeric(percentage),
size_category=factor(size_category,levels=c("(1,4]","(4,10]","(10,20]","(20,50]",">50")))
plot <- ggplot(frame, aes(x=size_category,y=percentage,fill=study)) +
geom_bar(stat = "identity", position = position_dodge())+
labs(title="Size classes",y="Percent of Total",x="Size category (hectares)")+
scale_fill_grey()+
theme(legend.position = "none")
plot1 <- set_panel_size(p = plot, width = unit(6, "cm"), height = unit(4,"cm"))
forplot<-probability.cat %>%
mutate(study="Statewide") %>%
add_row(Indicator="Chlorophyll-a",
Category="Eutrophic",
study="LMAS",
Estimate.P=((215/473)*100)) %>%
add_row(Indicator="Chlorophyll-a",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((182/473)*100)) %>%
add_row(Indicator="Chlorophyll-a",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((76/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Eutrophic",
study="LMAS",
Estimate.P=((211/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((141/473)*100)) %>%
add_row(Indicator="Phosphorus",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((121/473)*100)) %>%
add_row(Indicator="Secchi",
Category="Eutrophic",
study="LMAS",
Estimate.P=((371/700)*100)) %>%
add_row(Indicator="Secchi",
Category="Mesotrophic",
study="LMAS",
Estimate.P=((279/700)*100)) %>%
add_row(Indicator="Secchi",
Category="Oligotrophic",
study="LMAS",
Estimate.P=((50/700)*100)) %>%
mutate(LCB95Pct.P=ifelse(is.na(LCB95Pct.P),Estimate.P,LCB95Pct.P),
UCB95Pct.P=ifelse(is.na(UCB95Pct.P),Estimate.P,UCB95Pct.P))
plot <- ggplot(forplot, aes(x=Category,fill=study)) +
geom_bar_pattern(stat = "identity", position = position_dodge(),
aes(y=Estimate.P,
pattern_type = study,
pattern_fill = study),
pattern= 'magick',
fill= 'white',
colour= 'black',
pattern_scale = 0.5,
pattern_key_scale_factor = 1.1)+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
position=position_dodge(.9))+
facet_wrap(~Indicator,scales = "free")+
ylim(0,100)+
labs(title="Trophic Classes across NYS versus LMAS samples since 2010",y="Percent of Total",x="Trophic Classes")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
legend.key.height = unit(1, 'cm'),
legend.key.width = unit(1, 'cm'))+
scale_pattern_type_discrete(choices = c("gray15","bricks"))
# plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
#
# gridExtra::grid.arrange(plot)
plot2 <- ggplot(forplot[forplot$Indicator=="Phosphorus",], aes(x=Category,fill=study,y=Estimate.P)) +
geom_bar(stat = "identity", position = position_dodge())+
geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
position=position_dodge(.9))+
labs(title="Phosphorus",x="Trophic class",y="")+
theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
legend.key.height = unit(1, 'cm'),
legend.key.width = unit(1, 'cm'))+
scale_fill_grey()
plot2 <- set_panel_size(p = plot2, width = unit(4, "cm"), height = unit(4,"cm"))
gridExtra::grid.arrange(plot1, plot2,ncol=2)
sessionInfo(package=NULL)
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] maps_3.4.0 ggmap_3.0.0 spsurvey_5.0.1 survey_4.1-1
## [5] survival_3.2-13 Matrix_1.3-4 sf_1.0-3 ggpattern_0.4.2
## [9] egg_0.4.5 gridExtra_2.3 lubridate_1.8.0 huxtable_5.4.0
## [13] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [17] readr_2.0.2 tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5
## [21] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-153 bitops_1.0-7 fs_1.5.0
## [4] httr_1.4.2 tools_4.1.2 backports_1.3.0
## [7] bslib_0.3.1 utf8_1.2.2 R6_2.5.1
## [10] KernSmooth_2.23-20 AlgDesign_1.2.0 DBI_1.1.1
## [13] colorspace_2.0-2 sp_1.4-5 withr_2.4.2
## [16] tidyselect_1.1.1 compiler_4.1.2 cli_3.1.0
## [19] rvest_1.0.2 xml2_1.3.2 labeling_0.4.2
## [22] sass_0.4.0 scales_1.1.1 classInt_0.4-3
## [25] proxy_0.4-26 commonmark_1.7 crossdes_1.1-1
## [28] digest_0.6.28 minqa_1.2.4 rmarkdown_2.11
## [31] jpeg_0.1-9 pkgconfig_2.0.3 htmltools_0.5.2
## [34] lme4_1.1-27.1 highr_0.9 dbplyr_2.1.1
## [37] fastmap_1.1.0 rlang_0.4.12 readxl_1.3.1
## [40] rstudioapi_0.13 farver_2.1.0 jquerylib_0.1.4
## [43] generics_0.1.1 jsonlite_1.7.2 gtools_3.9.2
## [46] magrittr_2.0.1 Rcpp_1.0.7 munsell_0.5.0
## [49] fansi_0.5.0 lifecycle_1.0.1 stringi_1.7.5
## [52] yaml_2.2.1 MASS_7.3-54 plyr_1.8.6
## [55] crayon_1.4.2 deldir_1.0-6 lattice_0.20-45
## [58] haven_2.4.3 splines_4.1.2 hms_1.1.1
## [61] knitr_1.36 pillar_1.6.4 rjson_0.2.20
## [64] boot_1.3-28 reprex_2.0.1 glue_1.5.0
## [67] evaluate_0.14 mitools_2.4 modelr_0.1.8
## [70] png_0.1-7 vctrs_0.3.8 nloptr_1.2.2.3
## [73] tzdb_0.2.0 RgoogleMaps_1.4.5.3 cellranger_1.1.0
## [76] gtable_0.3.0 assertthat_0.2.1 xfun_0.28
## [79] broom_0.7.10 e1071_1.7-9 class_7.3-19
## [82] units_0.7-2 ellipsis_0.3.2
end.time <- Sys.time()
elapsed.time <- round((end.time - start.time), 3)
print(elapsed.time)
## Time difference of 15.012 mins